package cn.edu.bjtu.model.word2vec.domain;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map.Entry;

import cn.edu.bjtu.abstractimpl.analyzer.LuceneDocumentAnalyzer;
import cn.edu.bjtu.classimpl.parser.FileDocumentSource;
import cn.edu.bjtu.model.word2vec.Learn;
import cn.edu.bjtu.model.word2vec.util.Haffman;
import cn.edu.bjtu.model.word2vec.util.MapCount;

/**
 * Word2Vec trainer that learns word vectors from a document file in two passes:
 * pass 1 ({@link #readVocab}) builds the vocabulary and word frequencies and a
 * Huffman tree is constructed over it; pass 2 ({@link #trainModel}) runs
 * CBOW or skip-gram training (selected by the inherited {@code isCbow} flag)
 * with sub-sampling of frequent words.
 *
 * <p>Fields such as {@code wordMap}, {@code layerSize}, {@code alpha},
 * {@code startingAlpha}, {@code sample}, {@code window}, {@code isCbow},
 * {@code trainWordsCount} and the methods {@code cbowGram}/{@code skipGram}
 * are inherited from {@link Learn}.
 */
public class Word2Vec extends Learn {
	// Document source for the current pass; re-created for each pass over the file.
	FileDocumentSource fds = null;
	// Tokenizer: analyze() returns the document content as space-separated tokens.
	LuceneDocumentAnalyzer lda = new LuceneDocumentAnalyzer();

	/**
	 * Learns word vectors from the given corpus file.
	 *
	 * @param file corpus file to train on
	 * @return always {@code true} when training completes
	 * @throws IOException if reading the file fails
	 */
	@Override
	public boolean learnFile(File file) throws IOException {
		this.fds = new FileDocumentSource(file);

		// Pass 1: build the vocabulary (the File argument is unused; the
		// methods read from this.fds instead).
		readVocab(null);
		new Haffman(layerSize).make(wordMap.values());

		// Pre-compute the Huffman path (list of inner neurons) for every word.
		for (Neuron neuron : wordMap.values()) {
			((WordNeuron) neuron).makeNeurons();
		}
		// Re-open the source for the second (training) pass.
		this.fds = new FileDocumentSource(file);
		trainModel(null);
		return true;
	}

	/**
	 * Builds the vocabulary from {@code this.fds}: counts every token produced
	 * by the analyzer and creates a {@link WordNeuron} per distinct word.
	 *
	 * @param file unused; kept for interface compatibility — input comes from
	 *             {@code this.fds}, which must be initialized by the caller
	 * @throws IOException if reading from the document source fails
	 */
	protected void readVocab(File file) throws IOException {
		MapCount<String> mc = new MapCount<>();
		while (fds.hasNext()) {
			String temp = lda.analyze(fds.next().getContent());
			String[] split = temp.split(" ");
			trainWordsCount += split.length;
			for (String string : split) {
				mc.add(string);
			}
		}
		for (Entry<String, Integer> element : mc.get().entrySet()) {
			// NOTE(review): freq is normalized by vocabulary size (mc.size()),
			// not by trainWordsCount as in the reference word2vec sub-sampling
			// formula. Left unchanged because other Learn code may depend on
			// this scaling — verify against the sub-sampling math in trainModel.
			wordMap.put(element.getKey(),
					new WordNeuron(element.getKey(), (double) element.getValue() / mc.size(), layerSize));
		}
	}

	/**
	 * Runs the training pass over {@code this.fds}: sub-samples frequent words,
	 * then applies CBOW or skip-gram updates per sentence position while
	 * linearly decaying the learning rate {@code alpha}.
	 *
	 * @param file unused; kept for interface compatibility — input comes from
	 *             {@code this.fds}, which must be re-opened before this call
	 * @throws IOException if reading from the document source fails
	 */
	protected void trainModel(File file) throws IOException {
		String temp = null;
		long nextRandom = 5;
		int wordCount = 0;
		int lastWordCount = 0;
		int wordCountActual = 0;
		while (fds.hasNext()) {
			temp = lda.analyze(fds.next().getContent());

			// Every ~10k words: report progress and decay alpha linearly,
			// floored at 0.01% of the starting rate (as in the C reference).
			if (wordCount - lastWordCount > 10000) {
				System.out.println("alpha:" + alpha + "\tProgress: "
						+ (int) (wordCountActual / (double) (trainWordsCount + 1) * 100) + "%");
				wordCountActual += wordCount - lastWordCount;
				lastWordCount = wordCount;
				alpha = startingAlpha * (1 - wordCountActual / (double) (trainWordsCount + 1));
				if (alpha < startingAlpha * 0.0001) {
					alpha = startingAlpha * 0.0001;
				}
			}
			String[] strs = temp.split(" ");
			wordCount += strs.length;
			List<WordNeuron> sentence = new ArrayList<WordNeuron>();
			for (int i = 0; i < strs.length; i++) {
				Neuron entry = wordMap.get(strs[i]);
				if (entry == null) {
					continue;
				}
				// Sub-sampling randomly discards frequent words while
				// keeping the ranking the same.
				if (sample > 0) {
					double ran = (Math.sqrt(entry.freq / (sample * trainWordsCount)) + 1)
							* (sample * trainWordsCount) / entry.freq;
					nextRandom = nextRandom * 25214903917L + 11;
					if (ran < (nextRandom & 0xFFFF) / (double) 65536) {
						continue;
					}
				}
				sentence.add((WordNeuron) entry);
			}

			for (int index = 0; index < sentence.size(); index++) {
				nextRandom = nextRandom * 25214903917L + 11;
				// BUGFIX: the previous code passed "(int) nextRandom % window".
				// The cast binds tighter than %, truncating the 64-bit LCG
				// state to int BEFORE the modulo; once nextRandom overflows,
				// that int is often negative and Java's % then yields a
				// negative window shrink. Take the modulo on the long first
				// and normalize into [0, window).
				int b = (int) (nextRandom % window);
				if (b < 0) {
					b += window;
				}
				if (isCbow) {
					cbowGram(index, sentence, b);
				} else {
					skipGram(index, sentence, b);
				}
			}
		}
		System.out.println("Vocab size: " + wordMap.size());
		System.out.println("Words in train file: " + trainWordsCount);
		System.out.println("success train over!");
	}

}
